#install.packages("dplyr") # to manipulate gene expression data
#install.packages("tidyverse")
#install.packages("FactoMineR")
#install.packages("factoextra")
#install.packages("ggplot2") # to visualise and make graphs
#install.packages("pheatmap")
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("GEOquery")
#BiocManager::install("DESeq2")
#BiocManager::install("EnhancedVolcano")
#BiocManager::install('PCAtools')
#For GSEA
#BiocManager::install("msigdbr")
#BiocManager::install("clusterProfiler")
#BiocManager::install("fgsea", force = TRUE)
#BiocManager::install("org.Hs.eg.db")
library("dplyr")
library("tidyverse")
library("GEOquery")
library("FactoMineR")
library("factoextra")
library("ggplot2")
library("DESeq2")
library("EnhancedVolcano")
library("pheatmap")
library("PCAtools")
library("airway")
#For GSEA
library(tidyverse) #Data manipulation
library(msigdbr) #Broad gene set database
library(clusterProfiler) #Hypergeo enrichment
library(fgsea) #GSEA
library(enrichplot)
The objective of this assignment is to explore a publicly available transcriptomics dataset (bulk RNASeq). Biopsies were collected from multiple regions of the colon from 12 ulcerative colitis (UC) patients. Four biopsies were collected from each patient, from inflamed and non-inflamed areas of the colon.
data <- read.delim("GSE107593_raw_reads_BCHRNAseq.txt")
head(data)
## Row Type gene_name gene_type chr start end
## 1 ENSG00000000003.14 Ensembl TSPAN6 protein_coding chrX 100627109 100639991
## 2 ENSG00000000005.5 Ensembl TNMD protein_coding chrX 100584802 100599885
## 3 ENSG00000000419.12 Ensembl DPM1 protein_coding chr20 50934867 50958555
## 4 ENSG00000000457.13 Ensembl SCYL3 protein_coding chr1 169849631 169894267
## 5 ENSG00000000460.16 Ensembl C1orf112 protein_coding chr1 169662007 169854080
## 6 ENSG00000000938.12 Ensembl FGR protein_coding chr1 27612064 27635277
## strand length X877.2 X1057.1 X1057.2 X1077.1 X1077.2 X8878B1 X8878C1 X8881A1
## 1 - 4535 1834 1795 3253 1505 1610 1651 931 1068
## 2 + 1610 20 34 59 19 31 18 12 20
## 3 - 1207 773 541 1014 539 675 619 478 443
## 4 - 6883 512 435 688 351 419 523 399 336
## 5 + 5967 161 77 151 115 122 144 89 54
## 6 - 3474 42 11 21 40 72 147 192 42
## X8881B1 X157.6 X1214.3 X1214.4 X8854D1 X8854D2 X8855A2 X8855A3 X8855D1
## 1 2398 421 1670 1789 945 1406 975 968 1555
## 2 39 0 15 16 9 3 15 13 10
## 3 1373 550 576 653 431 410 813 1244 603
## 4 1004 339 398 425 366 275 265 594 424
## 5 218 101 107 90 136 78 223 284 148
## 6 85 794 69 57 231 131 136 355 79
## X8855D2 X8874D1 X8874F2 X157.3 X157.4 X877.3 X1057.4 X1057.6 X1192.5 X1192.6
## 1 1477 1723 1424 1578 1638 1230 479 387 549 1203
## 2 7 11 5 10 9 0 1 1 1 6
## 3 722 778 598 739 742 470 648 516 662 770
## 4 377 511 398 482 468 382 408 313 354 500
## 5 185 177 159 129 157 123 170 85 181 193
## 6 97 309 107 63 108 235 421 457 434 393
## X877.6 X1077.1.1 X1077.4 X1214.4.1 X1214.6 X8874A1 X8874B1 X8879.E2 X8879D1
## 1 1068 1103 2220 806 1048 2277 1757 385 526
## 2 4 0 0 3 3 18 20 2 5
## 3 448 446 1145 593 581 681 489 457 720
## 4 395 269 840 315 361 533 413 359 480
## 5 127 82 279 129 133 100 95 184 253
## 6 249 153 617 556 627 45 57 549 738
## X8881.E2 X8881F2 X157.3.1 X877.4 X1192.1 X1192.3 X8854A1 X8854A3 X8878.E4
## 1 719 615 835 1704 1906 2362 2041 2400 1135
## 2 8 2 5 22 17 24 26 27 7
## 3 526 431 526 588 1022 1558 636 820 494
## 4 461 286 373 449 501 742 565 697 416
## 5 116 79 98 115 231 383 130 152 117
## 6 460 221 225 34 137 267 36 47 329
## X8878D2 X8879A1 X8879B1
## 1 1478 1382 1213
## 2 8 9 10
## 3 671 493 450
## 4 427 399 364
## 5 157 106 81
## 6 373 73 52
nrow = nrow(data)
ncol = ncol(data)
cat("Number of rows :", nrow, "\n")
## Number of rows : 60498
cat("Number of columns :", ncol)
## Number of columns : 57
length(unique(data$gene_name))
## [1] 58579
The number of gene_names is far lower that the number of Ensembl Id.
glimpse(data)
## Rows: 60,498
## Columns: 57
## $ Row <chr> "ENSG00000000003.14", "ENSG00000000005.5", "ENSG00000000419.…
## $ Type <chr> "Ensembl", "Ensembl", "Ensembl", "Ensembl", "Ensembl", "Ense…
## $ gene_name <chr> "TSPAN6", "TNMD", "DPM1", "SCYL3", "C1orf112", "FGR", "CFH",…
## $ gene_type <chr> "protein_coding", "protein_coding", "protein_coding", "prote…
## $ chr <chr> "chrX", "chrX", "chr20", "chr1", "chr1", "chr1", "chr1", "ch…
## $ start <int> 100627109, 100584802, 50934867, 169849631, 169662007, 276120…
## $ end <int> 100639991, 100599885, 50958555, 169894267, 169854080, 276352…
## $ strand <chr> "-", "+", "-", "-", "+", "-", "+", "-", "-", "+", "-", "+", …
## $ length <int> 4535, 1610, 1207, 6883, 5967, 3474, 8145, 2793, 8463, 3811, …
## $ X877.2 <int> 1834, 20, 773, 512, 161, 42, 772, 1536, 1338, 543, 363, 1093…
## $ X1057.1 <int> 1795, 34, 541, 435, 77, 11, 523, 1097, 897, 420, 350, 1101, …
## $ X1057.2 <int> 3253, 59, 1014, 688, 151, 21, 1534, 1932, 1583, 856, 529, 18…
## $ X1077.1 <int> 1505, 19, 539, 351, 115, 40, 481, 1063, 975, 470, 313, 923, …
## $ X1077.2 <int> 1610, 31, 675, 419, 122, 72, 563, 1246, 1134, 508, 361, 947,…
## $ X8878B1 <int> 1651, 18, 619, 523, 144, 147, 832, 1422, 960, 573, 304, 1201…
## $ X8878C1 <int> 931, 12, 478, 399, 89, 192, 1089, 975, 760, 424, 213, 899, 4…
## $ X8881A1 <int> 1068, 20, 443, 336, 54, 42, 300, 960, 682, 295, 249, 843, 32…
## $ X8881B1 <int> 2398, 39, 1373, 1004, 218, 85, 1157, 2480, 2194, 918, 724, 2…
## $ X157.6 <int> 421, 0, 550, 339, 101, 794, 2073, 1393, 1091, 603, 147, 492,…
## $ X1214.3 <int> 1670, 15, 576, 398, 107, 69, 677, 1277, 862, 391, 408, 916, …
## $ X1214.4 <int> 1789, 16, 653, 425, 90, 57, 714, 1433, 967, 492, 418, 1018, …
## $ X8854D1 <int> 945, 9, 431, 366, 136, 231, 1063, 854, 493, 511, 203, 748, 4…
## $ X8854D2 <int> 1406, 3, 410, 275, 78, 131, 841, 917, 485, 436, 213, 807, 35…
## $ X8855A2 <int> 975, 15, 813, 265, 223, 136, 2131, 1172, 1289, 699, 153, 431…
## $ X8855A3 <int> 968, 13, 1244, 594, 284, 355, 3996, 1675, 2447, 1134, 339, 1…
## $ X8855D1 <int> 1555, 10, 603, 424, 148, 79, 1089, 1191, 1258, 539, 297, 908…
## $ X8855D2 <int> 1477, 7, 722, 377, 185, 97, 1253, 1260, 1232, 534, 332, 795,…
## $ X8874D1 <int> 1723, 11, 778, 511, 177, 309, 1126, 1296, 848, 690, 307, 113…
## $ X8874F2 <int> 1424, 5, 598, 398, 159, 107, 891, 1210, 711, 504, 178, 829, …
## $ X157.3 <int> 1578, 10, 739, 482, 129, 63, 967, 1658, 847, 559, 296, 995, …
## $ X157.4 <int> 1638, 9, 742, 468, 157, 108, 1145, 1659, 962, 615, 286, 906,…
## $ X877.3 <int> 1230, 0, 470, 382, 123, 235, 1252, 953, 617, 523, 147, 821, …
## $ X1057.4 <int> 479, 1, 648, 408, 170, 421, 2775, 1505, 1289, 537, 153, 761,…
## $ X1057.6 <int> 387, 1, 516, 313, 85, 457, 2508, 1245, 1023, 361, 104, 548, …
## $ X1192.5 <int> 549, 1, 662, 354, 181, 434, 1497, 1054, 759, 625, 118, 757, …
## $ X1192.6 <int> 1203, 6, 770, 500, 193, 393, 2512, 1510, 1030, 857, 236, 117…
## $ X877.6 <int> 1068, 4, 448, 395, 127, 249, 996, 895, 619, 478, 129, 783, 4…
## $ X1077.1.1 <int> 1103, 0, 446, 269, 82, 153, 1552, 914, 549, 457, 122, 533, 3…
## $ X1077.4 <int> 2220, 0, 1145, 840, 279, 617, 4492, 2166, 1790, 1379, 272, 2…
## $ X1214.4.1 <int> 806, 3, 593, 315, 129, 556, 2205, 1296, 879, 457, 170, 645, …
## $ X1214.6 <int> 1048, 3, 581, 361, 133, 627, 2907, 1586, 894, 486, 187, 836,…
## $ X8874A1 <int> 2277, 18, 681, 533, 100, 45, 868, 1455, 1135, 506, 329, 1291…
## $ X8874B1 <int> 1757, 20, 489, 413, 95, 57, 780, 1165, 1029, 500, 306, 1197,…
## $ X8879.E2 <int> 385, 2, 457, 359, 184, 549, 1130, 830, 775, 514, 133, 555, 5…
## $ X8879D1 <int> 526, 5, 720, 480, 253, 738, 1631, 1183, 1209, 707, 244, 874,…
## $ X8881.E2 <int> 719, 8, 526, 461, 116, 460, 1722, 1300, 1754, 566, 183, 934,…
## $ X8881F2 <int> 615, 2, 431, 286, 79, 221, 1085, 1064, 1031, 372, 201, 642, …
## $ X157.3.1 <int> 835, 5, 526, 373, 98, 225, 1349, 1087, 870, 447, 222, 633, 4…
## $ X877.4 <int> 1704, 22, 588, 449, 115, 34, 542, 1347, 1258, 519, 333, 996,…
## $ X1192.1 <int> 1906, 17, 1022, 501, 231, 137, 1761, 1761, 1325, 894, 571, 1…
## $ X1192.3 <int> 2362, 24, 1558, 742, 383, 267, 2822, 2254, 1914, 1375, 610, …
## $ X8854A1 <int> 2041, 26, 636, 565, 130, 36, 527, 1256, 886, 589, 409, 1484,…
## $ X8854A3 <int> 2400, 27, 820, 697, 152, 47, 749, 1659, 1138, 723, 484, 1528…
## $ X8878.E4 <int> 1135, 7, 494, 416, 117, 329, 1893, 1300, 1125, 564, 234, 103…
## $ X8878D2 <int> 1478, 8, 671, 427, 157, 373, 2284, 1526, 807, 718, 225, 976,…
## $ X8879A1 <int> 1382, 9, 493, 399, 106, 73, 592, 977, 799, 427, 328, 690, 44…
## $ X8879B1 <int> 1213, 10, 450, 364, 81, 52, 546, 921, 859, 402, 370, 804, 38…
Converting the variables from chr to factor
data$Row <- factor (data$Row)
data$strand <- factor (data$strand)
data$Type <- factor (data$Type)
data$gene_type <- factor (data$gene_type)
data$chr <- factor (data$chr)
And see the different levels of the categorical variables.
levels(data$strand)
## [1] "-" "+"
levels(data$Type)
## [1] "Ensembl"
levels(data$gene_type)
## [1] "3prime_overlapping_ncrna" "antisense"
## [3] "IG_C_gene" "IG_C_pseudogene"
## [5] "IG_D_gene" "IG_J_gene"
## [7] "IG_J_pseudogene" "IG_V_gene"
## [9] "IG_V_pseudogene" "lincRNA"
## [11] "macro_lncRNA" "miRNA"
## [13] "misc_RNA" "Mt_rRNA"
## [15] "Mt_tRNA" "polymorphic_pseudogene"
## [17] "processed_pseudogene" "processed_transcript"
## [19] "protein_coding" "pseudogene"
## [21] "ribozyme" "rRNA"
## [23] "scaRNA" "sense_intronic"
## [25] "sense_overlapping" "snoRNA"
## [27] "snRNA" "sRNA"
## [29] "TEC" "TR_C_gene"
## [31] "TR_D_gene" "TR_J_gene"
## [33] "TR_J_pseudogene" "TR_V_gene"
## [35] "TR_V_pseudogene" "transcribed_processed_pseudogene"
## [37] "transcribed_unitary_pseudogene" "transcribed_unprocessed_pseudogene"
## [39] "translated_unprocessed_pseudogene" "unitary_pseudogene"
## [41] "unprocessed_pseudogene" "vaultRNA"
levels(data$chr)
## [1] "chr1" "chr10" "chr11" "chr12" "chr13" "chr14" "chr15" "chr16" "chr17"
## [10] "chr18" "chr19" "chr2" "chr20" "chr21" "chr22" "chr3" "chr4" "chr5"
## [19] "chr6" "chr7" "chr8" "chr9" "chrM" "chrX" "chrY"
Analysis : - 9 descriptive variables : Row, Type, gene_name, gene_type, chr, start, end, strand and length - 48 samples : Four biopsies were collected from each of the 12 ulcerate colitis (UC) patients (4*12 = 48)
Preparation and selection of the columns we need
The rows correspond to the genes Ensemble Id and the columns to the samples
counts_data <- data[10:length(data)] # select only the 48 biopsies
rownames(counts_data) <- data$Row # rename the row lines, we can't use the column gene_name because it is not composed of unique values.
head(counts_data)
## X877.2 X1057.1 X1057.2 X1077.1 X1077.2 X8878B1 X8878C1
## ENSG00000000003.14 1834 1795 3253 1505 1610 1651 931
## ENSG00000000005.5 20 34 59 19 31 18 12
## ENSG00000000419.12 773 541 1014 539 675 619 478
## ENSG00000000457.13 512 435 688 351 419 523 399
## ENSG00000000460.16 161 77 151 115 122 144 89
## ENSG00000000938.12 42 11 21 40 72 147 192
## X8881A1 X8881B1 X157.6 X1214.3 X1214.4 X8854D1 X8854D2
## ENSG00000000003.14 1068 2398 421 1670 1789 945 1406
## ENSG00000000005.5 20 39 0 15 16 9 3
## ENSG00000000419.12 443 1373 550 576 653 431 410
## ENSG00000000457.13 336 1004 339 398 425 366 275
## ENSG00000000460.16 54 218 101 107 90 136 78
## ENSG00000000938.12 42 85 794 69 57 231 131
## X8855A2 X8855A3 X8855D1 X8855D2 X8874D1 X8874F2 X157.3
## ENSG00000000003.14 975 968 1555 1477 1723 1424 1578
## ENSG00000000005.5 15 13 10 7 11 5 10
## ENSG00000000419.12 813 1244 603 722 778 598 739
## ENSG00000000457.13 265 594 424 377 511 398 482
## ENSG00000000460.16 223 284 148 185 177 159 129
## ENSG00000000938.12 136 355 79 97 309 107 63
## X157.4 X877.3 X1057.4 X1057.6 X1192.5 X1192.6 X877.6
## ENSG00000000003.14 1638 1230 479 387 549 1203 1068
## ENSG00000000005.5 9 0 1 1 1 6 4
## ENSG00000000419.12 742 470 648 516 662 770 448
## ENSG00000000457.13 468 382 408 313 354 500 395
## ENSG00000000460.16 157 123 170 85 181 193 127
## ENSG00000000938.12 108 235 421 457 434 393 249
## X1077.1.1 X1077.4 X1214.4.1 X1214.6 X8874A1 X8874B1 X8879.E2
## ENSG00000000003.14 1103 2220 806 1048 2277 1757 385
## ENSG00000000005.5 0 0 3 3 18 20 2
## ENSG00000000419.12 446 1145 593 581 681 489 457
## ENSG00000000457.13 269 840 315 361 533 413 359
## ENSG00000000460.16 82 279 129 133 100 95 184
## ENSG00000000938.12 153 617 556 627 45 57 549
## X8879D1 X8881.E2 X8881F2 X157.3.1 X877.4 X1192.1 X1192.3
## ENSG00000000003.14 526 719 615 835 1704 1906 2362
## ENSG00000000005.5 5 8 2 5 22 17 24
## ENSG00000000419.12 720 526 431 526 588 1022 1558
## ENSG00000000457.13 480 461 286 373 449 501 742
## ENSG00000000460.16 253 116 79 98 115 231 383
## ENSG00000000938.12 738 460 221 225 34 137 267
## X8854A1 X8854A3 X8878.E4 X8878D2 X8879A1 X8879B1
## ENSG00000000003.14 2041 2400 1135 1478 1382 1213
## ENSG00000000005.5 26 27 7 8 9 10
## ENSG00000000419.12 636 820 494 671 493 450
## ENSG00000000457.13 565 697 416 427 399 364
## ENSG00000000460.16 130 152 117 157 106 81
## ENSG00000000938.12 36 47 329 373 73 52
gse <- getGEO(GEO = 'GSE107593', GSEMatrix = TRUE )
## Found 1 file(s)
## GSE107593_series_matrix.txt.gz
Look at the object :
gse
## $GSE107593_series_matrix.txt.gz
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 0 features, 48 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM2871805 GSM2871806 ... GSM2871852 (48 total)
## varLabels: title geo_accession ... subject:ch1 (45 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: GPL18573
Get the pheno data of the first element of that list
metadata <- pData(phenoData(gse[[1]]))
dim(metadata)
## [1] 48 45
The metadata has 48 rows and 45 variables.
head(metadata, 3)
## title geo_accession
## GSM2871805 Ulcerative colitis Colon Biopsy 157+6 Inflamed GSM2871805
## GSM2871806 Ulcerative colitis Colon Biopsy 157-3 Non-Inflamed GSM2871806
## GSM2871807 Ulcerative colitis Colon Biopsy 157-4 Non-Inflamed GSM2871807
## status submission_date last_update_date type
## GSM2871805 Public on Apr 01 2018 Dec 01 2017 May 15 2019 SRA
## GSM2871806 Public on Apr 01 2018 Dec 01 2017 May 15 2019 SRA
## GSM2871807 Public on Apr 01 2018 Dec 01 2017 May 15 2019 SRA
## channel_count source_name_ch1 organism_ch1 characteristics_ch1
## GSM2871805 1 Colon_157+6 Homo sapiens status: Inflamed
## GSM2871806 1 Colon_157-3 Homo sapiens status: Non-Inflamed
## GSM2871807 1 Colon_157-4 Homo sapiens status: Non-Inflamed
## characteristics_ch1.1 characteristics_ch1.2
## GSM2871805 site: Boston Children's Hospital location: Descending
## GSM2871806 site: Boston Children's Hospital location: Rectum
## GSM2871807 site: Boston Children's Hospital location: Rectum
## characteristics_ch1.3
## GSM2871805 subject: 157
## GSM2871806 subject: 157
## GSM2871807 subject: 157
## treatment_protocol_ch1
## GSM2871805 Colon biopsies were placed in RNAlater and stored at -80 until processed
## GSM2871806 Colon biopsies were placed in RNAlater and stored at -80 until processed
## GSM2871807 Colon biopsies were placed in RNAlater and stored at -80 until processed
## molecule_ch1
## GSM2871805 total RNA
## GSM2871806 total RNA
## GSM2871807 total RNA
## extract_protocol_ch1
## GSM2871805 Qiazol extraction of total RNA was performed according to the manufacturer's instructions.
## GSM2871806 Qiazol extraction of total RNA was performed according to the manufacturer's instructions.
## GSM2871807 Qiazol extraction of total RNA was performed according to the manufacturer's instructions.
## extract_protocol_ch1.1
## GSM2871805 Libraries were constructed using Illumina's Neoprep system according to the manufacturers instructions
## GSM2871806 Libraries were constructed using Illumina's Neoprep system according to the manufacturers instructions
## GSM2871807 Libraries were constructed using Illumina's Neoprep system according to the manufacturers instructions
## extract_protocol_ch1.2 taxid_ch1 description
## GSM2871805 stranded polyA based prep 9606 raw_reads_BCHRNAseq.xlsx
## GSM2871806 stranded polyA based prep 9606 raw_reads_BCHRNAseq.xlsx
## GSM2871807 stranded polyA based prep 9606 raw_reads_BCHRNAseq.xlsx
## data_processing
## GSM2871805 STAR used for mapping and alignment 2.4.0k
## GSM2871806 STAR used for mapping and alignment 2.4.0k
## GSM2871807 STAR used for mapping and alignment 2.4.0k
## data_processing.1
## GSM2871805 Featurecounts used for gene quantification 1.4.6
## GSM2871806 Featurecounts used for gene quantification 1.4.6
## GSM2871807 Featurecounts used for gene quantification 1.4.6
## data_processing.2 data_processing.3
## GSM2871805 EdgerR used for differential analysis Genome_build: Hg38
## GSM2871806 EdgerR used for differential analysis Genome_build: Hg38
## GSM2871807 EdgerR used for differential analysis Genome_build: Hg38
## data_processing.4
## GSM2871805 Supplementary_files_format_and_content: .xlsx contains raw reads for all samples
## GSM2871806 Supplementary_files_format_and_content: .xlsx contains raw reads for all samples
## GSM2871807 Supplementary_files_format_and_content: .xlsx contains raw reads for all samples
## platform_id contact_name contact_email
## GSM2871805 GPL18573 William,,Gordon william.gordon@pfizer.com
## GSM2871806 GPL18573 William,,Gordon william.gordon@pfizer.com
## GSM2871807 GPL18573 William,,Gordon william.gordon@pfizer.com
## contact_institute contact_address contact_city
## GSM2871805 Pfizer Inc 1 Portland Street Cambridge
## GSM2871806 Pfizer Inc 1 Portland Street Cambridge
## GSM2871807 Pfizer Inc 1 Portland Street Cambridge
## contact_zip/postal_code contact_country data_row_count
## GSM2871805 02139 USA 0
## GSM2871806 02139 USA 0
## GSM2871807 02139 USA 0
## instrument_model library_selection library_source
## GSM2871805 Illumina NextSeq 500 cDNA transcriptomic
## GSM2871806 Illumina NextSeq 500 cDNA transcriptomic
## GSM2871807 Illumina NextSeq 500 cDNA transcriptomic
## library_strategy
## GSM2871805 RNA-Seq
## GSM2871806 RNA-Seq
## GSM2871807 RNA-Seq
## relation
## GSM2871805 BioSample: https://www.ncbi.nlm.nih.gov/biosample/SAMN08118484
## GSM2871806 BioSample: https://www.ncbi.nlm.nih.gov/biosample/SAMN08118483
## GSM2871807 BioSample: https://www.ncbi.nlm.nih.gov/biosample/SAMN08118482
## relation.1
## GSM2871805 SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX3436775
## GSM2871806 SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX3436776
## GSM2871807 SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX3436777
## supplementary_file_1 location:ch1 site:ch1
## GSM2871805 NONE Descending Boston Children's Hospital
## GSM2871806 NONE Rectum Boston Children's Hospital
## GSM2871807 NONE Rectum Boston Children's Hospital
## status:ch1 subject:ch1
## GSM2871805 Inflamed 157
## GSM2871806 Non-Inflamed 157
## GSM2871807 Non-Inflamed 157
In the metadata we have information about each of the sample, their status (inflamed or not inflamed), how they are processed, the description, … For our analysis, we do not require all the 45 columns but only some columns of interest.
Therefore we are going to select only some of the columns and modify the column source_name_ch1 in order to fit with the name of the columns in the gene expression dataframe.
# select the columns you want and modify the samples column to fit the one of the gene expression
metadata.modify <- metadata %>%
dplyr::select('source_name_ch1','location:ch1','site:ch1','status:ch1', 'subject:ch1') %>%
mutate(source_name_ch1 = gsub("Colon_", "X",source_name_ch1)) %>%
mutate(source_name_ch1 = gsub(" ", ".",source_name_ch1)) %>% # because of 3 samples
mutate(source_name_ch1 = gsub("+", ".",source_name_ch1, fixed = TRUE)) %>%
mutate(source_name_ch1 = gsub("-", ".",source_name_ch1, fixed = TRUE))
# modify the 3 values for the samples that had the same number positive and negative
metadata.modify$source_name_ch1[4] <- paste0(metadata.modify$source_name_ch1[4], '.1') # for sample 157+3
metadata.modify$source_name_ch1[15] <- paste0(metadata.modify$source_name_ch1[15], '.1') # for sample 1077+1
metadata.modify$source_name_ch1[23] <- paste0(metadata.modify$source_name_ch1[23], '.1') # for sample 1214+4
# Renaming columns
colnames(metadata.modify) <- c('samples','location', 'site', 'status', 'subject')
#show the first lines of the metadata
head(metadata.modify)
## samples location site status subject
## GSM2871805 X157.6 Descending Boston Children's Hospital Inflamed 157
## GSM2871806 X157.3 Rectum Boston Children's Hospital Non-Inflamed 157
## GSM2871807 X157.4 Rectum Boston Children's Hospital Non-Inflamed 157
## GSM2871808 X157.3.1 Tranverse Boston Children's Hospital Inflamed 157
## GSM2871809 X877.3 Rectum Boston Children's Hospital Inflamed 877
## GSM2871810 X877.6 Sigmoid Boston Children's Hospital Inflamed 877
Then we should order the 48 lines by a certain value in order it to be the same as the order of the columns in the gene expression data.
# same ordering between the columns of gene expression data
metadata.modify <- metadata.modify[match(colnames(counts_data), metadata.modify$samples),]
# rename the rows
rownames(metadata.modify) <- metadata.modify$samples
# remove the samples column
metadata.modify <- metadata.modify[,-c(1)]
# show the data
head(metadata.modify)
## location site status subject
## X877.2 Ascending Boston Children's Hospital Non-Inflamed 877
## X1057.1 Ascending Boston Children's Hospital Non-Inflamed 1057
## X1057.2 Ascending Boston Children's Hospital Non-Inflamed 1057
## X1077.1 Ascending Boston Children's Hospital Non-Inflamed 1077
## X1077.2 Ascending Boston Children's Hospital Non-Inflamed 1077
## X8878B1 Ascending Brigham and Women's Hospital Non-Inflamed 8878
Check that the ordering is right and that all the column values match the row names of the Metadata
all(colnames(counts_data) %in% rownames(metadata.modify))
## [1] TRUE
all(colnames(counts_data) == rownames(metadata.modify))
## [1] TRUE
We are going to use the package DESeq2 for the questions of this assignment. But before starting to really answer the questions we will introduce this package and make a differential expression analysis with our data. And especially calculate the DESeq2 objects that we will need.
The first step is to construct a DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = counts_data, # the data with 48 biopies and EnsemblId as rownames
colData = metadata.modify, # the Metadata with four variables
design = ~ status) # Inflamed or Not Inflamed
# Let's take a look at that
dds
## class: DESeqDataSet
## dim: 60498 48
## metadata(1): version
## assays(1): counts
## rownames(60498): ENSG00000000003.14 ENSG00000000005.5 ...
## ENSGR0000280767.2 ENSGR0000281849.2
## rowData names(0):
## colnames(48): X877.2 X1057.1 ... X8879A1 X8879B1
## colData names(4): location site status subject
Let’s perform some pre-processing to help to reduce the size of the dds object as well as increase the speed of computation : removing the rows with low gene counts and keeping the rows that have at least 10 reads total.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep, ]
dds
## class: DESeqDataSet
## dim: 30952 48
## metadata(1): version
## assays(1): counts
## rownames(30952): ENSG00000000003.14 ENSG00000000005.5 ...
## ENSG00000282787.1 ENSG00000282793.1
## rowData names(0):
## colnames(48): X877.2 X1057.1 ... X8879A1 X8879B1
## colData names(4): location site status subject
We can see that we went from 60498 genes to 30952 genes. We filtered out many rows that had low gene counts.
Then we want to compare the Inflamed VS the Non-Inflamed levels. We want to set the factor level to the Non-Inflamed. (Otherwise the choice will by done alphabetically).
dds$status <- relevel(dds$status, ref = "Non-Inflamed")
Let’s run the DESeq function now :
dds <- DESeq(dds)
Since we have 30952 genes and we only want the 500 the most variable genes, we have to remove : 1- 500*100/30952 (cross-product)
vsdata <- vst(dds, blind=FALSE) # estimate dispersion trend and apply a variance stabilizing transformation
vst <- assay(vsdata)
proportion = 1-500/30952
p <- pca(vst, metadata = metadata.modify, removeVar = proportion ) # PCA
## -- removing the lower 98.3845955027139% of variables based on variance
length(p$xvars)
## [1] 500
Indeed, only 500 genes are left.
screeplot(p, axisLabSize = 8, titleLabSize = 20)
We can see on this plot that 59% of the information (variances) contained in the data is retained by the first principal component.
Let’s make a biplot of the individuals and of the 10 most important variables. We color by the status.
biplot(p, showLoadings = TRUE,
labSize = 2,
pointSize = 2,
sizeLoadingsNames = 3,
colby = 'status',
colLegendTitle = 'Status',
legendPosition = "right",
legendLabSize = 20,
legendTitleSize = 20)
The metadata variable status which gives us an information about the inflammation of different areas of the colon (Inflamed or not Inflamed) is best associated with the first principal component (on the x-axis). We can clearly distinguish two groups : on the right, biopsies collected from inflamed colon areas and on the left biopsies collected from non-inflamed colon areas.
TOP 10 genes contributing to principal component 1
# We order by the absolute value of the loadings.
top10_genes_PCA1 <- p$loadings[order(abs(p$loadings[,1]), decreasing = TRUE),]
head(top10_genes_PCA1[1],10)
## PC1
## ENSG00000103375.10 -0.12881958
## ENSG00000140279.12 0.11031204
## ENSG00000140274.13 0.10792952
## ENSG00000211897.7 0.10198018
## ENSG00000134240.11 -0.09840140
## ENSG00000211896.6 0.09698737
## ENSG00000133048.12 0.09400939
## ENSG00000268104.2 0.09229574
## ENSG00000124102.4 0.09218422
## ENSG00000115386.5 0.09183000
We have 2 negative loadings and 8 positive loadings.
top10_genes_PCA1 <- head(rownames(p$loadings[order(abs(p$loadings[,1]), decreasing = TRUE),]), 10)
top10_genes_PCA1
## [1] "ENSG00000103375.10" "ENSG00000140279.12" "ENSG00000140274.13"
## [4] "ENSG00000211897.7" "ENSG00000134240.11" "ENSG00000211896.6"
## [7] "ENSG00000133048.12" "ENSG00000268104.2" "ENSG00000124102.4"
## [10] "ENSG00000115386.5"
Let’s see the name and information of those 10 genes contributing the most to PCA1.
data[which(data$Row %in% top10_genes_PCA1),][1:9]
## Row Type gene_name gene_type chr start
## 2812 ENSG00000103375.10 Ensembl AQP8 protein_coding chr16 25215731
## 4421 ENSG00000115386.5 Ensembl REG1A protein_coding chr2 79120362
## 5453 ENSG00000124102.4 Ensembl PI3 protein_coding chr20 45174876
## 6705 ENSG00000133048.12 Ensembl CHI3L1 protein_coding chr1 203178931
## 6861 ENSG00000134240.11 Ensembl HMGCS2 protein_coding chr1 119747996
## 7937 ENSG00000140274.13 Ensembl DUOXA2 protein_coding chr15 45114321
## 7938 ENSG00000140279.12 Ensembl DUOX2 protein_coding chr15 45092650
## 21860 ENSG00000211896.6 Ensembl IGHG1 IG_C_gene chr14 105668113
## 21861 ENSG00000211897.7 Ensembl IGHG3 IG_C_gene chr14 105769102
## 51892 ENSG00000268104.2 Ensembl SLC6A14 protein_coding chrX 116436622
## end strand length
## 2812 25228940 + 1421
## 4421 79123419 + 1725
## 5453 45176544 + 577
## 6705 203186749 - 3408
## 6861 119768905 - 2571
## 7937 45118421 + 2554
## 7938 45114344 - 9834
## 21860 106538344 - 5138
## 21861 106038345 - 2166
## 51892 116461458 + 4520
We can notice that 8/10 are proteine coding genes and 2/10 are IG_C_gene.
TOP 10 genes contributing to principal component 2 :
# We order by the absolute value of the loadings.
top10_genes_PCA2 <- p$loadings[order(abs(p$loadings[,2]), decreasing = TRUE),]
head(top10_genes_PCA2[2],10)
## PC2
## ENSG00000229807.9 0.3393490
## ENSG00000129824.15 -0.2762467
## ENSG00000012817.15 -0.2560296
## ENSG00000131002.11 -0.2502176
## ENSG00000067048.16 -0.2383918
## ENSG00000099725.14 -0.2031069
## ENSG00000114374.12 -0.1893082
## ENSG00000198692.9 -0.1832433
## ENSG00000183878.15 -0.1821680
## ENSG00000067646.11 -0.1682262
We have 9 negative loadings and 1 positive loadings.
top10_genes_PCA2 <- head(rownames(p$loadings[order(abs(p$loadings[,2]), decreasing = TRUE),]), 10)
top10_genes_PCA2
## [1] "ENSG00000229807.9" "ENSG00000129824.15" "ENSG00000012817.15"
## [4] "ENSG00000131002.11" "ENSG00000067048.16" "ENSG00000099725.14"
## [7] "ENSG00000114374.12" "ENSG00000198692.9" "ENSG00000183878.15"
## [10] "ENSG00000067646.11"
Name and information of those 10 genes contributing the most to PCA1.
data[which(data$Row %in% top10_genes_PCA2),][1:9]
## Row Type gene_name gene_type
## 321 ENSG00000012817.15 Ensembl KDM5D protein_coding
## 993 ENSG00000067048.16 Ensembl DDX3Y protein_coding
## 1017 ENSG00000067646.11 Ensembl ZFY protein_coding
## 2109 ENSG00000099725.14 Ensembl PRKY transcribed_unprocessed_pseudogene
## 4279 ENSG00000114374.12 Ensembl USP9Y protein_coding
## 6181 ENSG00000129824.15 Ensembl RPS4Y1 protein_coding
## 6380 ENSG00000131002.11 Ensembl TXLNGY transcribed_unprocessed_pseudogene
## 15635 ENSG00000183878.15 Ensembl UTY protein_coding
## 17863 ENSG00000198692.9 Ensembl EIF1AY protein_coding
## 29822 ENSG00000229807.9 Ensembl XIST lincRNA
## chr start end strand length
## 321 chrY 19703865 19744939 - 9463
## 993 chrY 12904108 12920478 + 5738
## 1017 chrY 2935281 2982506 + 5527
## 2109 chrY 7273972 7381548 + 7674
## 4279 chrY 12701231 12860839 + 11237
## 6181 chrY 2841486 2932000 + 2275
## 6380 chrY 19567313 19606274 + 11377
## 15635 chrY 13248379 13480673 - 9191
## 17863 chrY 20575725 20593154 + 4074
## 29822 chrX 73820651 73852753 - 19931
We can notice that 9/10 genes best associated with PC2 are located in the Y chromosome and the 10th is on the X chromosome. (We noticed that the only one with negative loading is the one on Chr X) And 7/10 are protein coding genes.
Let’s save and explore the results of the dds.
# we put a threshold of padj = 0.05
res <- results(dds, alpha = 0.05, lfcThreshold = 0) # alpha is padj
res
## log2 fold change (MLE): status Inflamed vs Non.Inflamed
## Wald test p-value: status Inflamed vs Non.Inflamed
## DataFrame with 30952 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003.14 1353.7538 -0.826819 0.1434335 -5.76447 8.19140e-09
## ENSG00000000005.5 12.1653 -2.142063 0.2763982 -7.74992 9.19532e-15
## ENSG00000000419.12 635.4017 -0.178848 0.0496491 -3.60223 3.15495e-04
## ENSG00000000457.13 434.0679 -0.285682 0.0588203 -4.85687 1.19258e-06
## ENSG00000000460.16 137.0904 0.167074 0.0944149 1.76958 7.67977e-02
## ... ... ... ... ... ...
## ENSG00000282772.1 0.943429 -0.292489 0.532366 -0.549414 0.5827211
## ENSG00000282774.1 3.979929 0.590956 0.306425 1.928550 0.0537867
## ENSG00000282780.1 2.766038 0.962155 0.524902 1.833020 0.0667996
## ENSG00000282787.1 1.046650 1.108023 0.583690 1.898308 0.0576556
## ENSG00000282793.1 1.033749 -0.635093 0.570773 -1.112690 0.2658415
## padj
## <numeric>
## ENSG00000000003.14 4.36559e-08
## ENSG00000000005.5 1.00239e-13
## ENSG00000000419.12 8.96792e-04
## ENSG00000000457.13 4.86543e-06
## ENSG00000000460.16 1.29547e-01
## ... ...
## ENSG00000282772.1 0.6801433
## ENSG00000282774.1 0.0952494
## ENSG00000282780.1 0.1147905
## ENSG00000282787.1 0.1011781
## ENSG00000282793.1 0.3668508
The log2 fold change is calculated between the Inflamed vs Non.Inflamed levels. The statistical test used is the Wald test. This dataframe has multiple columns. - baseMean is the average of the normalized count taken over all the samples. - the log2FoldChange is the fold change of the gene in the Inflamed condition and compared to the Non.Inflamed The >0 values are the up- regulated genes in the Inflamed condition. The <0 values are the down-regulated genes in the Inflamed condition. - lfcSE : Standard Estimates for the Log2Fold change - stat : stat values for the wald test for each gene - pvalue : pvalue of the test statistics for each gene - padj : corrected pvalue for multiple testing (indeed 5% of our deferentially expressed genes are not really deferentially expressed but only due to random chances : they are false positive. 5% of 30k is a lot )
Let’s see the summary of the results :
summary(res)
##
## out of 30951 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 8076, 26%
## LFC < 0 (down) : 6652, 21%
## outliers [1] : 0, 0%
## low counts [2] : 2402, 7.8%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
8076 genes are upregulated 6652 genes are down regulated.
Make an MA plot to visualise the results.
plotMA(res)
We want to select the top 20 genes increased or decreased in inflamed biopsies Therefore, we order the genes by the lowest padj and select the TOP 20
res <- res[order(res$padj),] # order by lowest padj
head(res, 10)
## log2 fold change (MLE): status Inflamed vs Non.Inflamed
## Wald test p-value: status Inflamed vs Non.Inflamed
## DataFrame with 10 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000170382.11 203.7034 -3.80891 0.2252811 -16.9074 3.97151e-64
## ENSG00000185133.13 426.0919 -2.36644 0.1430813 -16.5391 1.91833e-61
## ENSG00000169035.11 44.8617 24.93639 1.5184976 16.4218 1.33652e-60
## ENSG00000166278.14 427.2735 2.00442 0.1244245 16.1095 2.18700e-58
## ENSG00000033170.16 794.7164 1.21777 0.0782077 15.5709 1.14694e-54
## ENSG00000168060.14 216.8451 -2.79997 0.1805221 -15.5104 2.95077e-54
## ENSG00000171385.9 428.9475 3.35545 0.2171236 15.4541 7.08049e-54
## ENSG00000165269.12 169.2890 -3.73378 0.2466353 -15.1389 8.97499e-52
## ENSG00000165548.10 160.5841 -3.40195 0.2253221 -15.0982 1.66500e-51
## ENSG00000137673.8 383.2655 7.86562 0.5228294 15.0443 3.76102e-51
## padj
## <numeric>
## ENSG00000170382.11 1.13387e-59
## ENSG00000185133.13 2.73842e-57
## ENSG00000169035.11 1.27192e-56
## ENSG00000166278.14 1.56097e-54
## ENSG00000033170.16 6.54903e-51
## ENSG00000168060.14 1.40407e-50
## ENSG00000171385.9 2.88783e-50
## ENSG00000165269.12 3.20295e-48
## ENSG00000165548.10 5.28175e-48
## ENSG00000137673.8 1.07377e-47
top20_lowest_padj <- head(rownames(res), 20)
top20_lowest_padj
## [1] "ENSG00000170382.11" "ENSG00000185133.13" "ENSG00000169035.11"
## [4] "ENSG00000166278.14" "ENSG00000033170.16" "ENSG00000168060.14"
## [7] "ENSG00000171385.9" "ENSG00000165269.12" "ENSG00000165548.10"
## [10] "ENSG00000137673.8" "ENSG00000162383.11" "ENSG00000164626.8"
## [13] "ENSG00000176472.10" "ENSG00000176273.14" "ENSG00000138669.9"
## [16] "ENSG00000062524.15" "ENSG00000172215.5" "ENSG00000141447.16"
## [19] "ENSG00000154122.12" "ENSG00000171503.11"
Name and information of those 20 genes with the lowest padj.
data[which(data$Row %in% top20_lowest_padj),][1:9]
## Row Type gene_name gene_type chr start
## 502 ENSG00000033170.16 Ensembl FUT8 protein_coding chr14 65410592
## 843 ENSG00000062524.15 Ensembl LTK protein_coding chr15 41503638
## 7509 ENSG00000137673.8 Ensembl MMP7 protein_coding chr11 102520508
## 7711 ENSG00000138669.9 Ensembl PRKG2 protein_coding chr4 81087370
## 8114 ENSG00000141447.16 Ensembl OSBPL1A protein_coding chr18 24162044
## 9728 ENSG00000154122.12 Ensembl ANKH protein_coding chr5 14704804
## 10721 ENSG00000162383.11 Ensembl SLC1A7 protein_coding chr1 53087179
## 11405 ENSG00000164626.8 Ensembl KCNK5 protein_coding chr6 39188973
## 11570 ENSG00000165269.12 Ensembl AQP7 protein_coding chr9 33383179
## 11631 ENSG00000165548.10 Ensembl TMEM63C protein_coding chr14 77116568
## 11819 ENSG00000166278.14 Ensembl C2 protein_coding chr6 31897785
## 12277 ENSG00000168060.14 Ensembl NAALADL1 protein_coding chr11 65044818
## 12509 ENSG00000169035.11 Ensembl KLK7 protein_coding chr19 50976482
## 12818 ENSG00000170382.11 Ensembl LRRN2 protein_coding chr1 204617170
## 13050 ENSG00000171385.9 Ensembl KCND3 protein_coding chr1 111770662
## 13093 ENSG00000171503.11 Ensembl ETFDH protein_coding chr4 158672125
## 13260 ENSG00000172215.5 Ensembl CXCR6 protein_coding chr3 45940933
## 14068 ENSG00000176273.14 Ensembl SLC35G1 protein_coding chr10 93893973
## 14105 ENSG00000176472.10 Ensembl ZNF575 protein_coding chr19 43525497
## 15937 ENSG00000185133.13 Ensembl INPP5J protein_coding chr22 31122731
## end strand length
## 502 65744121 + 6700
## 843 41513887 - 3490
## 7509 102530753 - 2323
## 7711 81215117 - 5116
## 8114 24397880 - 8022
## 9728 14871778 - 10865
## 10721 53142632 - 5407
## 11405 39229450 - 3756
## 11570 33402682 - 5253
## 11631 77259495 + 6363
## 11819 31945672 + 5536
## 12277 65058549 - 4044
## 12509 50984099 - 2434
## 12818 204685733 - 5681
## 13050 111989155 - 7863
## 13093 158709623 + 4493
## 13260 45948353 + 3769
## 14068 93956062 + 6829
## 14105 43536130 + 3222
## 15937 31134696 + 4822
Those genes are the Top20 genes increased or decreased in inflamed biopsies. Let’s make a volcano plot to show them :
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
xlim=c(-10,10),
y = 'pvalue',
selectLab = top20_lowest_padj,
title = "EnhancedVolcano plot ",
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = res$padj[19], # to see only the 20 genes that have been selected
FCcutoff = 0,
pointSize = 1.0,
labSize = 3,
labCol = 'black',
labFace = 'bold',
#boxedLabels = TRUE,
colAlpha = 3/5,
legendPosition = 'top',
legendLabSize = 14,
legendIconSize = 4.0,
drawConnectors = TRUE,
widthConnectors = 0.1,
colConnectors = 'black')
In order to see if these 20 genes display a concordant profile across the patients, we will plot for each of them the counts of reads for a single gene across the groups.
par(mfrow=c(2,3))
for (val in 1: length(top20_lowest_padj))
{
plotCounts(dds, gene=top20_lowest_padj[val], intgroup="status")
}
We can see that for those 20 genes, we can each time distinguish 2 groups between the Non-Inflamed and Inflamed patients. It is more ambiguous for gene ENSG00000169035.11.
df <- as.data.frame(colData(dds)[,c("status","site")])
pheatmap(vst[top20_lowest_padj,], cluster_rows=TRUE, show_rownames=TRUE,
cluster_cols=TRUE, annotation_col = df[1])
These top 20 genes display a concordant profile across the patients. We can distinguish e sily distinguish the two groups on the heatmap. Some genes seem to be under-expressed in the Inflamed patients whereas others seem to be overexpressed.
Gene set enrichment analysis (GSEA) (also functional enrichment analysis) is a method to identify classes of genes or proteins that are over-represented in a large set of genes or proteins, and may have an association with disease phenotype.
The list of deferentially expressed genes is sometimes so long that its interpretation becomes cumbersome and time consuming. A common downstream procedure is gene set testing. It aims to understand which pathways or gene networks the deferentially expressed genes are implicated in.
Get gene set data
First, we extract the lists of genes in C2 terms from the package msigdbr. We format it as a data frame so we can use the tidyverse.
#C2
C2 <- as.data.frame(msigdbr(species = "Homo sapiens",
category = "C2",
subcategory = "CP:REACTOME"))
head(C2,3)
## gs_cat gs_subcat gs_name gene_symbol entrez_gene
## 1 C2 CP:REACTOME REACTOME_2_LTR_CIRCLE_FORMATION BANF1 8815
## 2 C2 CP:REACTOME REACTOME_2_LTR_CIRCLE_FORMATION HMGA1 3159
## 3 C2 CP:REACTOME REACTOME_2_LTR_CIRCLE_FORMATION LIG4 3981
## ensembl_gene human_gene_symbol human_entrez_gene human_ensembl_gene gs_id
## 1 ENSG00000175334 BANF1 8815 ENSG00000175334 M26996
## 2 ENSG00000137309 HMGA1 3159 ENSG00000137309 M26996
## 3 ENSG00000174405 LIG4 3981 ENSG00000174405 M26996
## gs_pmid gs_geoid gs_exact_source
## 1 R-HSA-164843
## 2 R-HSA-164843
## 3 R-HSA-164843
## gs_url
## 1 https://www.reactome.org/content/detail/R-HSA-164843|https://reactome.org/PathwayBrowser/#/R-HSA-164843
## 2 https://www.reactome.org/content/detail/R-HSA-164843|https://reactome.org/PathwayBrowser/#/R-HSA-164843
## 3 https://www.reactome.org/content/detail/R-HSA-164843|https://reactome.org/PathwayBrowser/#/R-HSA-164843
## gs_description
## 1 2-LTR circle formation
## 2 2-LTR circle formation
## 3 2-LTR circle formation
Here, we see the start of the C2 data including :
Extract the matrix of genes and the log2FoldChange values
GSEA compares mean gene expression fold change between two states. We extract the genes and the log2FoldChange from the results of DESeq2 and we sort the list in decreasing order of L2FC (required for clusterProfiler).
# we want the log2 fold change
gene_matrix <- res$log2FoldChange
# name the vector by removing everything that is after the "." in the rownames of the ensemblId
names(gene_matrix) <- gsub("\\..*","", rownames(res))
# omit any NA values
gene_matrix<-na.omit(gene_matrix)
# sort the list in decreasing order (required for clusterProfiler)
gene_matrix = sort(gene_matrix, decreasing = TRUE)
head(gene_matrix)
## ENSG00000169035 ENSG00000167755 ENSG00000205420 ENSG00000057149 ENSG00000206073
## 24.936391 9.283336 8.854681 8.446746 8.294018
## ENSG00000162892
## 8.112976
This is the format we need for GSEA, for the package to run
Run GSEA
gsea_results <- GSEA(
geneList = gene_matrix, # Ordered ranked gene list
minGSSize = 25, # Minimum gene set size
maxGSSize = 500, # Maximum gene set set
pvalueCutoff = 0.05, # p-value cutoff
eps = 0, # Boundary for calculating the p value
seed = TRUE, # Set seed to make results reproducible
pAdjustMethod = "BH", # Benjamini-Hochberg correction
TERM2GENE = dplyr::select(
C2,
gs_name,
ensembl_gene
)
)
We can access the results from our gsea_results object using @result Let’s convert the contents of result into a data frame that we can use for further analysis.
gsea_result_df <- data.frame(gsea_results@result)
Visualize significant GSEA
The most common visualization for GSEA is the enrichment or normalized enrichment score. Let’s plot NES for the subset of significant GSEA at FDR < 0.05. This is actually pretty strict for GSEA and people often go up to FDR < 0.2.
gsea_result_df %>%
filter(p.adjust <= 0.05) %>% # filter only those we a certain value of padj
#Beautify descriptions by removing _ and REACTOME
mutate(ID = gsub("REACTOME_","", ID),
ID = gsub("_"," ", ID)) %>%
ggplot(aes(x=reorder(ID, NES), #Reorder gene sets by NES values
y=NES)) +
geom_col() +
theme_classic() +
#Force equal max min
#lims(y=c(-3.2,3.2)) +
#Some more customization to pretty it up
#Flip x and y so long labels can be read
coord_flip() +
#fix labels
labs(y="Normalized enrichment score (NES)",
x="Gene set",
title = " Reactom GSEA (padj<0.05) ") +
theme(axis.text.y = element_text(face="bold", color="black", size=2))
NES > 0 (positive enrichment), corresponds to higher expression in Inflamed areas (up-regulated) NES < 0 (negative enrichment), corresponds to lower expression in Inflamed areas (down regulated)
Most Positive NES
Let’s look at the 3 gene sets with the most positive NES.
gsea_result_df %>%
# This returns the 3 rows with the largest NES values
dplyr::slice_max(NES, n = 3)
## ID
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT
## REACTOME_CD22_MEDIATED_BCR_REGULATION REACTOME_CD22_MEDIATED_BCR_REGULATION
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS
## Description
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT
## REACTOME_CD22_MEDIATED_BCR_REGULATION REACTOME_CD22_MEDIATED_BCR_REGULATION
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS
## setSize enrichmentScore NES
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 75 0.8874821 2.840290
## REACTOME_CD22_MEDIATED_BCR_REGULATION 59 0.9166868 2.838644
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 67 0.8937998 2.821065
## pvalue p.adjust
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 4.960481e-33 1.228546e-30
## REACTOME_CD22_MEDIATED_BCR_REGULATION 1.029925e-30 1.275390e-28
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 3.631240e-30 3.854302e-28
## qvalues rank
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 9.224753e-31 2106
## REACTOME_CD22_MEDIATED_BCR_REGULATION 9.576493e-29 1924
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS 2.894071e-28 2106
## leading_edge
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT tags=87%, list=7%, signal=81%
## REACTOME_CD22_MEDIATED_BCR_REGULATION tags=95%, list=6%, signal=89%
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS tags=88%, list=7%, signal=82%
## core_enrichment
## REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT ENSG00000211896/ENSG00000211892/ENSG00000274576/ENSG00000251039/ENSG00000142748/ENSG00000211893/ENSG00000211956/ENSG00000242371/ENSG00000211658/ENSG00000211973/ENSG00000211662/ENSG00000127241/ENSG00000242076/ENSG00000211651/ENSG00000211648/ENSG00000211659/ENSG00000085265/ENSG00000211663/ENSG00000244116/ENSG00000211653/ENSG00000270550/ENSG00000243466/ENSG00000244437/ENSG00000211962/ENSG00000211673/ENSG00000211959/ENSG00000211941/ENSG00000211640/ENSG00000211967/ENSG00000211644/ENSG00000211677/ENSG00000239951/ENSG00000211660/ENSG00000242534/ENSG00000224373/ENSG00000211964/ENSG00000211666/ENSG00000240382/ENSG00000211955/ENSG00000211679/ENSG00000211942/ENSG00000211949/ENSG00000211934/ENSG00000243649/ENSG00000211668/ENSG00000125730/ENSG00000241351/ENSG00000240864/ENSG00000211598/ENSG00000251546/ENSG00000211625/ENSG00000278196/ENSG00000211599/ENSG00000211652/ENSG00000166278/ENSG00000224389/ENSG00000244731/ENSG00000278857/ENSG00000241244/ENSG00000243290/ENSG00000159403/ENSG00000243238/ENSG00000239975/ENSG00000126759/ENSG00000182326
## REACTOME_CD22_MEDIATED_BCR_REGULATION ENSG00000274576/ENSG00000251039/ENSG00000211956/ENSG00000242371/ENSG00000211658/ENSG00000211973/ENSG00000211662/ENSG00000242076/ENSG00000211651/ENSG00000211648/ENSG00000211659/ENSG00000211663/ENSG00000244116/ENSG00000211653/ENSG00000270550/ENSG00000243466/ENSG00000244437/ENSG00000211962/ENSG00000211673/ENSG00000211898/ENSG00000211959/ENSG00000211941/ENSG00000211640/ENSG00000105369/ENSG00000211967/ENSG00000211644/ENSG00000211677/ENSG00000239951/ENSG00000211660/ENSG00000242534/ENSG00000224373/ENSG00000211964/ENSG00000211666/ENSG00000240382/ENSG00000211955/ENSG00000211679/ENSG00000211942/ENSG00000211949/ENSG00000211934/ENSG00000211668/ENSG00000241351/ENSG00000240864/ENSG00000211598/ENSG00000251546/ENSG00000211625/ENSG00000007312/ENSG00000278196/ENSG00000012124/ENSG00000211899/ENSG00000211599/ENSG00000211652/ENSG00000278857/ENSG00000241244/ENSG00000243290/ENSG00000243238/ENSG00000239975
## REACTOME_CREATION_OF_C4_AND_C2_ACTIVATORS ENSG00000211896/ENSG00000211892/ENSG00000274576/ENSG00000251039/ENSG00000142748/ENSG00000211893/ENSG00000211956/ENSG00000242371/ENSG00000211658/ENSG00000211973/ENSG00000211662/ENSG00000127241/ENSG00000242076/ENSG00000211651/ENSG00000211648/ENSG00000211659/ENSG00000085265/ENSG00000211663/ENSG00000244116/ENSG00000211653/ENSG00000270550/ENSG00000243466/ENSG00000244437/ENSG00000211962/ENSG00000211673/ENSG00000211959/ENSG00000211941/ENSG00000211640/ENSG00000211967/ENSG00000211644/ENSG00000211677/ENSG00000239951/ENSG00000211660/ENSG00000242534/ENSG00000224373/ENSG00000211964/ENSG00000211666/ENSG00000240382/ENSG00000211955/ENSG00000211679/ENSG00000211942/ENSG00000211949/ENSG00000211934/ENSG00000211668/ENSG00000241351/ENSG00000240864/ENSG00000211598/ENSG00000251546/ENSG00000211625/ENSG00000278196/ENSG00000211599/ENSG00000211652/ENSG00000278857/ENSG00000241244/ENSG00000243290/ENSG00000159403/ENSG00000243238/ENSG00000239975/ENSG00000182326
The gene set REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT has the most positive NES score.
It’s GSEA plot (Plot of the Running Enrichment Score (green line) for a gene set as the analysis walks down the ranked gene list, including the location of the maximum enrichment score (the red line). The black lines in the Running Enrichment Score show where the members of the gene set appear in the ranked list of genes, indicating the leading edge subset.)
most_positive_nes_plot <- enrichplot::gseaplot(
gsea_results,
geneSetID = "REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT",
title = "REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT",
color.line = "#0d76ff"
)
most_positive_nes_plot
Notice how the genes that are in the gene set, indicated by the black bars, tend to be on the left side of the graph indicating that they have positive gene-level scores. The red dashed line indicates the enrichment score, which is the maximum deviation from zero. As mentioned earlier, an enrichment is calculated by starting with the most highly ranked genes (according to the gene-level log2 fold changes values) and increasing the score when a gene is in the pathway and decreasing the score when a gene is not in the pathway.
Most Negative NES
Let’s look for the 3 gene sets with the most negative NES.
gsea_result_df %>%
# Return the 3 rows with the smallest (most negative) NES values
dplyr::slice_min(NES, n = 3)
## ID
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS
## Description
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS
## setSize
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT 174
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 102
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS 126
## enrichmentScore
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT -0.6195357
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT -0.6676811
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS -0.6265223
## NES
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT -2.835304
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT -2.754751
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS -2.645439
## pvalue
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT 4.372662e-22
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 1.033983e-16
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS 9.914801e-17
## p.adjust
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT 1.371505e-20
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 2.560831e-15
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS 2.540240e-15
## qvalues
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT 1.029819e-20
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 1.922846e-15
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS 1.907385e-15
## rank
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT 7699
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 7792
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS 7792
## leading_edge
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT tags=78%, list=25%, signal=59%
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT tags=77%, list=25%, signal=58%
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS tags=77%, list=25%, signal=58%
## core_enrichment
## REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT ENSG00000166260/ENSG00000124172/ENSG00000147576/ENSG00000110717/ENSG00000240230/ENSG00000115944/ENSG00000151366/ENSG00000067992/ENSG00000065518/ENSG00000143252/ENSG00000101247/ENSG00000119689/ENSG00000135390/ENSG00000165264/ENSG00000123545/ENSG00000067829/ENSG00000145494/ENSG00000184857/ENSG00000133983/ENSG00000125375/ENSG00000100577/ENSG00000167283/ENSG00000139180/ENSG00000115286/ENSG00000156467/ENSG00000105953/ENSG00000241468/ENSG00000172840/ENSG00000063854/ENSG00000184752/ENSG00000170906/ENSG00000186010/ENSG00000178449/ENSG00000136521/ENSG00000183648/ENSG00000090266/ENSG00000173660/ENSG00000119013/ENSG00000101365/ENSG00000130159/ENSG00000004779/ENSG00000099795/ENSG00000110435/ENSG00000167863/ENSG00000184983/ENSG00000119421/ENSG00000112992/ENSG00000109390/ENSG00000166411/ENSG00000213585/ENSG00000180902/ENSG00000087299/ENSG00000100156/ENSG00000150768/ENSG00000164258/ENSG00000152234/ENSG00000176340/ENSG00000126267/ENSG00000213619/ENSG00000127540/ENSG00000131495/ENSG00000198786/ENSG00000158864/ENSG00000164405/ENSG00000146701/ENSG00000198763/ENSG00000184076/ENSG00000128609/ENSG00000099624/ENSG00000241837/ENSG00000167792/ENSG00000133028/ENSG00000151413/ENSG00000140990/ENSG00000127184/ENSG00000125356/ENSG00000060762/ENSG00000091140/ENSG00000091483/ENSG00000154723/ENSG00000130414/ENSG00000159199/ENSG00000198888/ENSG00000073578/ENSG00000138095/ENSG00000131143/ENSG00000136463/ENSG00000062485/ENSG00000165629/ENSG00000112033/ENSG00000136143/ENSG00000228253/ENSG00000105379/ENSG00000140740/ENSG00000179091/ENSG00000143158/ENSG00000131174/ENSG00000169020/ENSG00000198899/ENSG00000116459/ENSG00000147684/ENSG00000198712/ENSG00000010256/ENSG00000186350/ENSG00000169021/ENSG00000023228/ENSG00000117118/ENSG00000154518/ENSG00000198727/ENSG00000164919/ENSG00000198886/ENSG00000131828/ENSG00000198840/ENSG00000110955/ENSG00000180185/ENSG00000111775/ENSG00000166796/ENSG00000100412/ENSG00000178741/ENSG00000198804/ENSG00000135940/ENSG00000082212/ENSG00000198695/ENSG00000163541/ENSG00000204370/ENSG00000140374/ENSG00000198938/ENSG00000004799/ENSG00000172115/ENSG00000172340/ENSG00000171503/ENSG00000151376/ENSG00000005882/ENSG00000172270/ENSG00000155380
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT ENSG00000166260/ENSG00000110717/ENSG00000240230/ENSG00000115944/ENSG00000151366/ENSG00000065518/ENSG00000143252/ENSG00000101247/ENSG00000165264/ENSG00000123545/ENSG00000145494/ENSG00000184857/ENSG00000133983/ENSG00000139180/ENSG00000115286/ENSG00000156467/ENSG00000184752/ENSG00000170906/ENSG00000186010/ENSG00000178449/ENSG00000136521/ENSG00000183648/ENSG00000090266/ENSG00000173660/ENSG00000119013/ENSG00000130159/ENSG00000004779/ENSG00000099795/ENSG00000184983/ENSG00000119421/ENSG00000109390/ENSG00000164258/ENSG00000176340/ENSG00000126267/ENSG00000213619/ENSG00000127540/ENSG00000131495/ENSG00000198786/ENSG00000158864/ENSG00000164405/ENSG00000198763/ENSG00000184076/ENSG00000128609/ENSG00000167792/ENSG00000133028/ENSG00000151413/ENSG00000140990/ENSG00000127184/ENSG00000125356/ENSG00000130414/ENSG00000198888/ENSG00000073578/ENSG00000138095/ENSG00000131143/ENSG00000136463/ENSG00000105379/ENSG00000140740/ENSG00000179091/ENSG00000131174/ENSG00000147684/ENSG00000198712/ENSG00000010256/ENSG00000169021/ENSG00000023228/ENSG00000117118/ENSG00000198727/ENSG00000164919/ENSG00000198886/ENSG00000198840/ENSG00000111775/ENSG00000178741/ENSG00000198804/ENSG00000135940/ENSG00000198695/ENSG00000204370/ENSG00000140374/ENSG00000198938/ENSG00000172115/ENSG00000171503
## REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS ENSG00000166260/ENSG00000124172/ENSG00000110717/ENSG00000240230/ENSG00000115944/ENSG00000151366/ENSG00000065518/ENSG00000143252/ENSG00000101247/ENSG00000135390/ENSG00000165264/ENSG00000123545/ENSG00000145494/ENSG00000184857/ENSG00000133983/ENSG00000125375/ENSG00000167283/ENSG00000139180/ENSG00000115286/ENSG00000156467/ENSG00000241468/ENSG00000184752/ENSG00000170906/ENSG00000186010/ENSG00000178449/ENSG00000136521/ENSG00000183648/ENSG00000090266/ENSG00000173660/ENSG00000119013/ENSG00000130159/ENSG00000004779/ENSG00000099795/ENSG00000167863/ENSG00000184983/ENSG00000119421/ENSG00000109390/ENSG00000164258/ENSG00000152234/ENSG00000176340/ENSG00000126267/ENSG00000213619/ENSG00000127540/ENSG00000131495/ENSG00000198786/ENSG00000158864/ENSG00000164405/ENSG00000198763/ENSG00000184076/ENSG00000128609/ENSG00000099624/ENSG00000241837/ENSG00000167792/ENSG00000133028/ENSG00000151413/ENSG00000140990/ENSG00000127184/ENSG00000125356/ENSG00000154723/ENSG00000130414/ENSG00000159199/ENSG00000198888/ENSG00000073578/ENSG00000138095/ENSG00000131143/ENSG00000136463/ENSG00000165629/ENSG00000228253/ENSG00000105379/ENSG00000140740/ENSG00000179091/ENSG00000131174/ENSG00000169020/ENSG00000198899/ENSG00000116459/ENSG00000147684/ENSG00000198712/ENSG00000010256/ENSG00000169021/ENSG00000023228/ENSG00000117118/ENSG00000154518/ENSG00000198727/ENSG00000164919/ENSG00000198886/ENSG00000198840/ENSG00000110955/ENSG00000111775/ENSG00000178741/ENSG00000198804/ENSG00000135940/ENSG00000198695/ENSG00000204370/ENSG00000140374/ENSG00000198938/ENSG00000172115/ENSG00000171503
The gene set REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT has the most negative NES.
most_negative_nes_plot <- enrichplot::gseaplot(
gsea_results,
geneSetID = "REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT",
title = "REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT",
color.line = "#0d76ff"
)
most_negative_nes_plot
This gene set shows the opposite pattern – genes in the pathway tend to be on the right side of the graph. Again, the red dashed line here indicates the maximum deviation from zero, in other words, the enrichment score. A negative enrichment score will be returned when many genes are near the bottom of the ranked list.
Dot Plots
require(DOSE)
## Le chargement a nécessité le package : DOSE
## DOSE v3.22.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
dotplot(gsea_results, showCategory=3, split=".sign") + facet_grid(.~.sign)
We select the Top significantly genes and their L2FC
The Top significantly up- or down-regulated genes in inflamed biopsies (log2 Fold Change <1.5 or >1.5)
# we want the log2 fold change that are > or < to 1.5
gene_matrix_l2FC_1.5 <-res %>%
as.data.frame() %>%
select('log2FoldChange') %>%
filter(abs(log2FoldChange) > 1.5)
# extract only the column L2FC
gene_matrix_l2FC_1.5 <- gene_matrix_l2FC_1.5$log2FoldChange
# name the vector by removing everything that is after the "." in the rownames of the ensemblId
names(gene_matrix_l2FC_1.5) <- gsub("\\..*","", rownames(gene_matrix_l2FC_1.5))
# omit any NA values
gene_matrix_l2FC_1.5<-na.omit(gene_matrix)
# sort the list in decreasing order (required for clusterProfiler)
gene_matrix_l2FC_1.5 = sort(gene_matrix_l2FC_1.5, decreasing = TRUE)
head(gene_matrix_l2FC_1.5)
## ENSG00000169035 ENSG00000167755 ENSG00000205420 ENSG00000057149 ENSG00000206073
## 24.936391 9.283336 8.854681 8.446746 8.294018
## ENSG00000162892
## 8.112976
Get gene set data
We extract the lists of genes in C8 terms from the package msigdbr. We format it as a data frame so we can use the tidyverse.
#C8
C8 <- as.data.frame(msigdbr(species = "Homo sapiens", category = "C8"))
head(C8)
## gs_cat gs_subcat gs_name gene_symbol entrez_gene
## 1 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 ADGRE5 976
## 2 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 ADGRG1 9289
## 3 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 AKNA 80709
## 4 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 ALOX5AP 241
## 5 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 APOBEC3C 27350
## 6 C8 AIZARANI_LIVER_C1_NK_NKT_CELLS_1 APOBEC3G 60489
## ensembl_gene human_gene_symbol human_entrez_gene human_ensembl_gene gs_id
## 1 ENSG00000123146 ADGRE5 976 ENSG00000123146 M39105
## 2 ENSG00000205336 ADGRG1 9289 ENSG00000205336 M39105
## 3 ENSG00000106948 AKNA 80709 ENSG00000106948 M39105
## 4 ENSG00000132965 ALOX5AP 241 ENSG00000132965 M39105
## 5 ENSG00000244509 APOBEC3C 27350 ENSG00000244509 M39105
## 6 ENSG00000239713 APOBEC3G 60489 ENSG00000239713 M39105
## gs_pmid gs_geoid
## 1 31292543 GSE124395
## 2 31292543 GSE124395
## 3 31292543 GSE124395
## 4 31292543 GSE124395
## 5 31292543 GSE124395
## 6 31292543 GSE124395
## gs_exact_source
## 1 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## 2 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## 3 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## 4 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## 5 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## 6 Supplementary Table 1 Gene expression signatures of all clusters. Official gene symbol, mean expression across all genes not contained within a cluster (mean.ncl), mean expression across all genes contained within a cluster (mean.cl), fold change between cluster and non-cluster cells (fc), differential expression p-value (pv, see Methods), and Benjamini-Hochberg corrected p-value are given. Only genes with P<0.01 are shown for each cluster. (n= 10,372 cells). These cluster markers were further filtered to include only genes with a FC>2.
## gs_url gs_description
## 1 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
## 2 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
## 3 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
## 4 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
## 5 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
## 6 http://human-liver-cell-atlas.ie-freiburg.mpg.de/
This is the format we need for GSEA, for the package to run
Run GSEA
gsea_results_C8 <- GSEA(
geneList = gene_matrix_l2FC_1.5, # Ordered ranked gene list
minGSSize = 25, # Minimum gene set size
maxGSSize = 500, # Maximum gene set set
pvalueCutoff = 0.05, # p-value cutoff
eps = 0, # Boundary for calculating the p value
seed = TRUE, # Set seed to make results reproducible
pAdjustMethod = "BH", # Benjamini-Hochberg correction
TERM2GENE = dplyr::select(
C8,
gs_name,
ensembl_gene
)
)
We can access the results from our gsea_results object using @result Let’s convert the contents of result into a data frame that we can use for further analysis and write to a file later.
gsea_result_df_C8 <- data.frame(gsea_results_C8@result)
Visualize significant GSEA
The most common visualization for GSEA is the enrichment or normalized enrichment score. Let’s plot NES for the subset of significant GSEA at FDR < 0.05. This is actually pretty strict for GSEA and people often go up to FDR < 0.2.
gsea_result_df_C8 %>%
filter(p.adjust <= 0.05) %>% # filter only those we a certain value of padj
#Beautify descriptions by removing _ and REACTOME
mutate(ID = gsub("_"," ", ID)) %>%
ggplot(aes(x=reorder(ID, NES), #Reorder gene sets by NES values
y=NES)) +
geom_col() +
theme_classic() +
#Force equal max min
#lims(y=c(-3.2,3.2)) +
#Some more customization to pretty it up
#Flip x and y so long labels can be read
coord_flip() +
#fix labels
labs(y="Normalized enrichment score (NES)",
x="Gene set",
title = " Reactom GSEA (padj<0.05) ") +
theme(axis.text.y = element_text(face="bold", color="black", size=2))
Most Positive NES
Let’s look at the 5 cell type signature gene sets with the most positive NES.
gsea_result_df_C8 %>%
# This returns the 5 rows with the largest NES values
dplyr::slice_max(NES, n = 5)
## ID
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS DESCARTES_FETAL_LUNG_LYMPHOID_CELLS
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS DESCARTES_FETAL_HEART_LYMPHOID_CELLS
## Description
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS DESCARTES_FETAL_LUNG_LYMPHOID_CELLS
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS DESCARTES_FETAL_HEART_LYMPHOID_CELLS
## setSize enrichmentScore NES
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS 358 0.7452799 2.684867
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS 158 0.7425657 2.553761
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS 154 0.7382550 2.538876
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS 161 0.7331187 2.522597
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS 110 0.7557634 2.510709
## pvalue p.adjust
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS 3.107389e-63 1.783641e-60
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS 6.505232e-27 7.468006e-25
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS 4.548280e-26 3.263391e-24
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS 2.197634e-26 1.802060e-24
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS 2.264748e-20 7.222030e-19
## qvalues rank
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS 7.196059e-61 5696
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS 3.012950e-25 5351
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS 1.316607e-24 6095
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS 7.270367e-25 5842
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS 2.913711e-19 5376
## leading_edge
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS tags=81%, list=18%, signal=67%
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS tags=75%, list=17%, signal=63%
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS tags=77%, list=20%, signal=62%
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS tags=75%, list=19%, signal=61%
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS tags=77%, list=17%, signal=64%
## core_enrichment
## GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS ENSG00000153789/ENSG00000119508/ENSG00000110944/ENSG00000124256/ENSG00000061656/ENSG00000088340/ENSG00000143297/ENSG00000170476/ENSG00000122188/ENSG00000105369/ENSG00000143603/ENSG00000186827/ENSG00000254709/ENSG00000177455/ENSG00000168081/ENSG00000103257/ENSG00000121101/ENSG00000099958/ENSG00000059804/ENSG00000167476/ENSG00000110777/ENSG00000184588/ENSG00000071575/ENSG00000007312/ENSG00000090339/ENSG00000083454/ENSG00000090104/ENSG00000028277/ENSG00000225783/ENSG00000135604/ENSG00000261371/ENSG00000079263/ENSG00000136573/ENSG00000163568/ENSG00000078081/ENSG00000222041/ENSG00000177272/ENSG00000109790/ENSG00000070190/ENSG00000241106/ENSG00000159618/ENSG00000161031/ENSG00000139193/ENSG00000167077/ENSG00000189233/ENSG00000230006/ENSG00000130775/ENSG00000100368/ENSG00000144815/ENSG00000008513/ENSG00000137265/ENSG00000168421/ENSG00000251301/ENSG00000164691/ENSG00000104972/ENSG00000171310/ENSG00000182022/ENSG00000117090/ENSG00000239713/ENSG00000136490/ENSG00000198624/ENSG00000122862/ENSG00000146094/ENSG00000182871/ENSG00000077984/ENSG00000177989/ENSG00000196839/ENSG00000188596/ENSG00000172578/ENSG00000155926/ENSG00000107742/ENSG00000186891/ENSG00000247774/ENSG00000158517/ENSG00000101445/ENSG00000081052/ENSG00000249096/ENSG00000054967/ENSG00000176788/ENSG00000111335/ENSG00000104894/ENSG00000151651/ENSG00000035720/ENSG00000135074/ENSG00000072694/ENSG00000015285/ENSG00000270164/ENSG00000169442/ENSG00000169508/ENSG00000106415/ENSG00000160932/ENSG00000131196/ENSG00000105371/ENSG00000010671/ENSG00000172716/ENSG00000115165/ENSG00000102879/ENSG00000102760/ENSG00000168685/ENSG00000181458/ENSG00000136286/ENSG00000122122/ENSG00000081237/ENSG00000096996/ENSG00000146112/ENSG00000180353/ENSG00000143119/ENSG00000136167/ENSG00000122986/ENSG00000057657/ENSG00000104814/ENSG00000100055/ENSG00000128218/ENSG00000163219/ENSG00000160255/ENSG00000266094/ENSG00000135736/ENSG00000077420/ENSG00000165178/ENSG00000143851/ENSG00000122591/ENSG00000010810/ENSG00000135842/ENSG00000123329/ENSG00000026751/ENSG00000185811/ENSG00000232810/ENSG00000132704/ENSG00000023902/ENSG00000186517/ENSG00000182487/ENSG00000124508/ENSG00000183486/ENSG00000147889/ENSG00000147065/ENSG00000171488/ENSG00000072818/ENSG00000078589/ENSG00000110876/ENSG00000185905/ENSG00000175463/ENSG00000197872/ENSG00000121895/ENSG00000116824/ENSG00000134516/ENSG00000115935/ENSG00000111348/ENSG00000197043/ENSG00000185862/ENSG00000154589/ENSG00000104998/ENSG00000162894/ENSG00000140030/ENSG00000157303/ENSG00000091592/ENSG00000172349/ENSG00000110324/ENSG00000058091/ENSG00000005844/ENSG00000120280/ENSG00000171700/ENSG00000137078/ENSG00000152804/ENSG00000168918/ENSG00000020633/ENSG00000105122/ENSG00000213654/ENSG00000143515/ENSG00000152689/ENSG00000165272/ENSG00000107099/ENSG00000167123/ENSG00000092621/ENSG00000187688/ENSG00000109113/ENSG00000229228/ENSG00000154102/ENSG00000138964/ENSG00000196154/ENSG00000130755/ENSG00000124772/ENSG00000142227/ENSG00000123338/ENSG00000093072/ENSG00000115355/ENSG00000111886/ENSG00000155307/ENSG00000145088/ENSG00000012779/ENSG00000105329/ENSG00000196628/ENSG00000151702/ENSG00000089327/ENSG00000136732/ENSG00000069424/ENSG00000113532/ENSG00000106605/ENSG00000139278/ENSG00000125354/ENSG00000122224/ENSG00000198832/ENSG00000167895/ENSG00000170571/ENSG00000167286/ENSG00000155849/ENSG00000180096/ENSG00000146192/ENSG00000115085/ENSG00000130702/ENSG00000009790/ENSG00000129657/ENSG00000166501/ENSG00000119729/ENSG00000130303/ENSG00000110848/ENSG00000204516/ENSG00000162511/ENSG00000115295/ENSG00000198879/ENSG00000182866/ENSG00000026025/ENSG00000237499/ENSG00000163644/ENSG00000169826/ENSG00000065413/ENSG00000263961/ENSG00000074657/ENSG00000070882/ENSG00000100629/ENSG00000128294/ENSG00000111860/ENSG00000172236/ENSG00000133574/ENSG00000184451/ENSG00000172183/ENSG00000188677/ENSG00000198851/ENSG00000105374/ENSG00000148516/ENSG00000105974/ENSG00000231721/ENSG00000198873/ENSG00000165168/ENSG00000115232/ENSG00000075884/ENSG00000110852/ENSG00000198771/ENSG00000124067/ENSG00000160469/ENSG00000112293/ENSG00000111245/ENSG00000231389/ENSG00000130592/ENSG00000141293/ENSG00000100365/ENSG00000111674/ENSG00000117091/ENSG00000197535/ENSG00000105426/ENSG00000126860/ENSG00000100097/ENSG00000138166/ENSG00000124813/ENSG00000240505/ENSG00000082074/ENSG00000133874/ENSG00000105851/ENSG00000126264/ENSG00000037042/ENSG00000147443/ENSG00000159110/ENSG00000108798/ENSG00000184293/ENSG00000185669/ENSG00000048462/ENSG00000138378/ENSG00000079308/ENSG00000171502/ENSG00000173762/ENSG00000182158/ENSG00000185880/ENSG00000077943/ENSG00000281103/ENSG00000176438/ENSG00000152256/ENSG00000100628/ENSG00000167851/ENSG00000169554/ENSG00000130005/ENSG00000090554/ENSG00000197253
## DESCARTES_FETAL_ADRENAL_LYMPHOID_CELLS ENSG00000197057/ENSG00000211898/ENSG00000100721/ENSG00000143297/ENSG00000170476/ENSG00000122188/ENSG00000105369/ENSG00000211592/ENSG00000182885/ENSG00000154451/ENSG00000177455/ENSG00000196092/ENSG00000188404/ENSG00000007312/ENSG00000163600/ENSG00000180739/ENSG00000163534/ENSG00000083454/ENSG00000156738/ENSG00000211899/ENSG00000100453/ENSG00000255733/ENSG00000181847/ENSG00000136573/ENSG00000197705/ENSG00000132185/ENSG00000113088/ENSG00000134460/ENSG00000241106/ENSG00000163508/ENSG00000139193/ENSG00000183918/ENSG00000160683/ENSG00000271856/ENSG00000137265/ENSG00000167483/ENSG00000168421/ENSG00000117090/ENSG00000160856/ENSG00000124203/ENSG00000077984/ENSG00000142765/ENSG00000158050/ENSG00000247774/ENSG00000081052/ENSG00000162739/ENSG00000166750/ENSG00000151651/ENSG00000128322/ENSG00000169442/ENSG00000161405/ENSG00000186265/ENSG00000073861/ENSG00000198286/ENSG00000115165/ENSG00000168685/ENSG00000125245/ENSG00000267568/ENSG00000136286/ENSG00000112182/ENSG00000172995/ENSG00000112486/ENSG00000110448/ENSG00000126353/ENSG00000081985/ENSG00000026751/ENSG00000227507/ENSG00000215367/ENSG00000234663/ENSG00000132704/ENSG00000163564/ENSG00000170006/ENSG00000172575/ENSG00000113263/ENSG00000115523/ENSG00000072818/ENSG00000078589/ENSG00000175463/ENSG00000027869/ENSG00000116824/ENSG00000205045/ENSG00000162894/ENSG00000023445/ENSG00000137078/ENSG00000020633/ENSG00000180644/ENSG00000245164/ENSG00000100385/ENSG00000211772/ENSG00000122224/ENSG00000167286/ENSG00000180096/ENSG00000115085/ENSG00000009790/ENSG00000213809/ENSG00000182866/ENSG00000204475/ENSG00000164674/ENSG00000100450/ENSG00000197540/ENSG00000244720/ENSG00000115607/ENSG00000172183/ENSG00000198851/ENSG00000105374/ENSG00000160654/ENSG00000109943/ENSG00000198821/ENSG00000167094/ENSG00000174946/ENSG00000160185/ENSG00000141293/ENSG00000163519/ENSG00000238121/ENSG00000173762/ENSG00000196684/ENSG00000153563/ENSG00000081059/ENSG00000172116
## DESCARTES_FETAL_LUNG_LYMPHOID_CELLS ENSG00000124256/ENSG00000211898/ENSG00000049768/ENSG00000100721/ENSG00000188389/ENSG00000049249/ENSG00000143297/ENSG00000163599/ENSG00000170476/ENSG00000122188/ENSG00000105369/ENSG00000211592/ENSG00000064886/ENSG00000254709/ENSG00000177455/ENSG00000230489/ENSG00000196092/ENSG00000125910/ENSG00000005102/ENSG00000007312/ENSG00000163600/ENSG00000180739/ENSG00000117322/ENSG00000163534/ENSG00000083454/ENSG00000156738/ENSG00000211899/ENSG00000100453/ENSG00000255733/ENSG00000181847/ENSG00000136573/ENSG00000154611/ENSG00000132185/ENSG00000113088/ENSG00000134460/ENSG00000139193/ENSG00000183918/ENSG00000160683/ENSG00000271856/ENSG00000167483/ENSG00000168421/ENSG00000117090/ENSG00000182183/ENSG00000160856/ENSG00000077984/ENSG00000247774/ENSG00000162739/ENSG00000035720/ENSG00000128322/ENSG00000169442/ENSG00000161405/ENSG00000073861/ENSG00000226979/ENSG00000102879/ENSG00000168685/ENSG00000267568/ENSG00000172215/ENSG00000128218/ENSG00000110448/ENSG00000126353/ENSG00000089012/ENSG00000227507/ENSG00000234663/ENSG00000132704/ENSG00000163564/ENSG00000172575/ENSG00000113263/ENSG00000115523/ENSG00000175463/ENSG00000116824/ENSG00000162894/ENSG00000131686/ENSG00000137078/ENSG00000180644/ENSG00000245164/ENSG00000246084/ENSG00000251002/ENSG00000100385/ENSG00000211772/ENSG00000122224/ENSG00000167286/ENSG00000180096/ENSG00000138795/ENSG00000115085/ENSG00000213809/ENSG00000182866/ENSG00000204475/ENSG00000100450/ENSG00000197540/ENSG00000232021/ENSG00000244720/ENSG00000115607/ENSG00000245954/ENSG00000198851/ENSG00000105374/ENSG00000160654/ENSG00000109943/ENSG00000198821/ENSG00000174946/ENSG00000186810/ENSG00000160185/ENSG00000141293/ENSG00000163519/ENSG00000238121/ENSG00000240505/ENSG00000231682/ENSG00000138378/ENSG00000173762/ENSG00000233355/ENSG00000153563/ENSG00000081059/ENSG00000172116/ENSG00000107447/ENSG00000090554/ENSG00000117560/ENSG00000243811/ENSG00000132465/ENSG00000172673
## DESCARTES_FETAL_PANCREAS_LYMPHOID_CELLS ENSG00000162892/ENSG00000101842/ENSG00000111537/ENSG00000167618/ENSG00000211898/ENSG00000049768/ENSG00000100721/ENSG00000172867/ENSG00000188389/ENSG00000122188/ENSG00000105369/ENSG00000240194/ENSG00000211677/ENSG00000188011/ENSG00000064886/ENSG00000211679/ENSG00000177455/ENSG00000187862/ENSG00000196092/ENSG00000110777/ENSG00000137101/ENSG00000007312/ENSG00000163600/ENSG00000163534/ENSG00000083454/ENSG00000156738/ENSG00000012124/ENSG00000211899/ENSG00000102962/ENSG00000244301/ENSG00000158525/ENSG00000181847/ENSG00000136573/ENSG00000255760/ENSG00000132185/ENSG00000113088/ENSG00000134460/ENSG00000163508/ENSG00000139193/ENSG00000205056/ENSG00000186049/ENSG00000183918/ENSG00000168421/ENSG00000182183/ENSG00000160856/ENSG00000183813/ENSG00000077984/ENSG00000187621/ENSG00000128322/ENSG00000179253/ENSG00000161405/ENSG00000253522/ENSG00000186265/ENSG00000073861/ENSG00000125245/ENSG00000267568/ENSG00000112182/ENSG00000128218/ENSG00000110448/ENSG00000126353/ENSG00000089012/ENSG00000234663/ENSG00000132704/ENSG00000163564/ENSG00000113263/ENSG00000115523/ENSG00000078589/ENSG00000175463/ENSG00000027869/ENSG00000116824/ENSG00000162894/ENSG00000137078/ENSG00000013725/ENSG00000180644/ENSG00000245164/ENSG00000115602/ENSG00000246084/ENSG00000100385/ENSG00000211780/ENSG00000211772/ENSG00000135426/ENSG00000167286/ENSG00000180096/ENSG00000138795/ENSG00000115085/ENSG00000211778/ENSG00000110848/ENSG00000213809/ENSG00000182866/ENSG00000204475/ENSG00000100450/ENSG00000197540/ENSG00000232021/ENSG00000147138/ENSG00000244720/ENSG00000245954/ENSG00000198851/ENSG00000105374/ENSG00000107485/ENSG00000160654/ENSG00000109943/ENSG00000198821/ENSG00000167094/ENSG00000174946/ENSG00000204165/ENSG00000141293/ENSG00000163519/ENSG00000238121/ENSG00000240505/ENSG00000184293/ENSG00000231682/ENSG00000235576/ENSG00000233355/ENSG00000153563/ENSG00000081059/ENSG00000172116/ENSG00000107447/ENSG00000229153/ENSG00000117560/ENSG00000257495/ENSG00000102245
## DESCARTES_FETAL_HEART_LYMPHOID_CELLS ENSG00000050730/ENSG00000049768/ENSG00000100721/ENSG00000188389/ENSG00000143297/ENSG00000211592/ENSG00000064886/ENSG00000177455/ENSG00000196092/ENSG00000110777/ENSG00000188404/ENSG00000007312/ENSG00000163600/ENSG00000117322/ENSG00000163534/ENSG00000156738/ENSG00000100453/ENSG00000255733/ENSG00000181847/ENSG00000136573/ENSG00000132185/ENSG00000113088/ENSG00000139193/ENSG00000205056/ENSG00000183918/ENSG00000271856/ENSG00000120659/ENSG00000117090/ENSG00000182183/ENSG00000160856/ENSG00000077984/ENSG00000162739/ENSG00000128322/ENSG00000169442/ENSG00000161405/ENSG00000073861/ENSG00000198286/ENSG00000226979/ENSG00000172215/ENSG00000128218/ENSG00000110448/ENSG00000126353/ENSG00000089012/ENSG00000234663/ENSG00000132704/ENSG00000147889/ENSG00000172575/ENSG00000113263/ENSG00000115523/ENSG00000072818/ENSG00000254838/ENSG00000162894/ENSG00000137078/ENSG00000180644/ENSG00000245164/ENSG00000246084/ENSG00000100385/ENSG00000211772/ENSG00000122224/ENSG00000167286/ENSG00000180096/ENSG00000115085/ENSG00000213809/ENSG00000182866/ENSG00000204475/ENSG00000100450/ENSG00000197540/ENSG00000232021/ENSG00000115607/ENSG00000172183/ENSG00000105374/ENSG00000160654/ENSG00000109943/ENSG00000174946/ENSG00000186810/ENSG00000160185/ENSG00000141293/ENSG00000163519/ENSG00000238121/ENSG00000240505/ENSG00000231682/ENSG00000173762/ENSG00000081059/ENSG00000172116/ENSG00000162645
We can see that indeed the first cell type signature gene sets is GAO_LARGE_INTESTINE_ADULT_CI_MESENCHYMAL_CELLS which is concordant since the biopsies have been collected from that area of the human body.
Amélioration : pas sûr DU TOUT de ma conclusion. surtout que le numéro 2 c’est un rein qui revient quoi.
Most Negative NES
Let’s look for the 5 gene sets with the most negative NES.
gsea_result_df_C8 %>%
# Return the 5 rows with the smallest (most negative) NES values
dplyr::slice_min(NES, n = 5)
## ID
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE
## Description
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE
## setSize enrichmentScore
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES 176 -0.6520973
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES 112 -0.6518687
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS 254 -0.5643728
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES 255 -0.5575105
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE 56 -0.6820471
## NES pvalue
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES -2.912169 1.170995e-26
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES -2.739415 3.156376e-16
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS -2.646039 1.705066e-24
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES -2.583055 1.326081e-23
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE -2.573482 6.691017e-10
## p.adjust qvalues
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES 1.120252e-24 4.519630e-25
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES 6.247449e-15 2.520518e-15
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS 1.087453e-22 4.387304e-23
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES 6.919730e-22 2.791749e-22
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE 4.465865e-09 1.801743e-09
## rank
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES 3524
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES 4508
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS 3469
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES 4201
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE 4536
## leading_edge
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES tags=54%, list=11%, signal=48%
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES tags=59%, list=15%, signal=51%
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS tags=50%, list=11%, signal=45%
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES tags=44%, list=14%, signal=38%
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE tags=57%, list=15%, signal=49%
## core_enrichment
## BUSSLINGER_DUODENAL_LATE_IMMATURE_ENTEROCYTES ENSG00000163686/ENSG00000144063/ENSG00000108242/ENSG00000106404/ENSG00000084234/ENSG00000143891/ENSG00000163399/ENSG00000197142/ENSG00000172380/ENSG00000189334/ENSG00000089220/ENSG00000105398/ENSG00000013583/ENSG00000165092/ENSG00000119125/ENSG00000151715/ENSG00000163694/ENSG00000069849/ENSG00000099812/ENSG00000138029/ENSG00000134716/ENSG00000084754/ENSG00000164749/ENSG00000167996/ENSG00000143198/ENSG00000079112/ENSG00000162482/ENSG00000005022/ENSG00000211454/ENSG00000004799/ENSG00000145384/ENSG00000060971/ENSG00000100889/ENSG00000147872/ENSG00000166126/ENSG00000117054/ENSG00000111684/ENSG00000106853/ENSG00000145703/ENSG00000124839/ENSG00000142484/ENSG00000166959/ENSG00000198189/ENSG00000167315/ENSG00000171431/ENSG00000165475/ENSG00000168903/ENSG00000138109/ENSG00000106688/ENSG00000088256/ENSG00000166866/ENSG00000099834/ENSG00000113303/ENSG00000120915/ENSG00000075239/ENSG00000198074/ENSG00000164237/ENSG00000111802/ENSG00000197888/ENSG00000171747/ENSG00000189221/ENSG00000162545/ENSG00000172831/ENSG00000080493/ENSG00000213996/ENSG00000139540/ENSG00000112818/ENSG00000108187/ENSG00000196502/ENSG00000007216/ENSG00000241224/ENSG00000172367/ENSG00000161533/ENSG00000278535/ENSG00000114113/ENSG00000104267/ENSG00000091138/ENSG00000148671/ENSG00000086696/ENSG00000278505/ENSG00000160868/ENSG00000060566/ENSG00000163586/ENSG00000135220/ENSG00000122121/ENSG00000166869/ENSG00000249948/ENSG00000124253/ENSG00000166825/ENSG00000118777/ENSG00000174358/ENSG00000134240/ENSG00000141434/ENSG00000172689/ENSG00000125255
## BUSSLINGER_DUODENAL_EARLY_IMMATURE_ENTEROCYTES ENSG00000213585/ENSG00000152234/ENSG00000176340/ENSG00000112695/ENSG00000164405/ENSG00000241837/ENSG00000171766/ENSG00000127184/ENSG00000198668/ENSG00000154723/ENSG00000131143/ENSG00000165629/ENSG00000050405/ENSG00000140740/ENSG00000179091/ENSG00000131174/ENSG00000163399/ENSG00000170421/ENSG00000165678/ENSG00000138413/ENSG00000089220/ENSG00000013583/ENSG00000154518/ENSG00000165092/ENSG00000164919/ENSG00000121691/ENSG00000110955/ENSG00000111775/ENSG00000084754/ENSG00000178741/ENSG00000135940/ENSG00000131981/ENSG00000167996/ENSG00000163541/ENSG00000143198/ENSG00000102554/ENSG00000005022/ENSG00000145384/ENSG00000100889/ENSG00000250479/ENSG00000106853/ENSG00000171431/ENSG00000165475/ENSG00000243955/ENSG00000132437/ENSG00000198074/ENSG00000164237/ENSG00000197888/ENSG00000171747/ENSG00000189221/ENSG00000172831/ENSG00000080493/ENSG00000104419/ENSG00000108187/ENSG00000244067/ENSG00000278535/ENSG00000114113/ENSG00000104267/ENSG00000091138/ENSG00000086696/ENSG00000248144/ENSG00000163586/ENSG00000166869/ENSG00000249948/ENSG00000166825/ENSG00000134240
## DESCARTES_FETAL_INTESTINE_INTESTINAL_EPITHELIAL_CELLS ENSG00000108242/ENSG00000156463/ENSG00000113722/ENSG00000130545/ENSG00000163362/ENSG00000164406/ENSG00000102890/ENSG00000197142/ENSG00000095539/ENSG00000178826/ENSG00000009765/ENSG00000257702/ENSG00000189334/ENSG00000105398/ENSG00000010438/ENSG00000179674/ENSG00000119431/ENSG00000164362/ENSG00000119125/ENSG00000151715/ENSG00000251429/ENSG00000207713/ENSG00000225969/ENSG00000187658/ENSG00000212496/ENSG00000116771/ENSG00000162069/ENSG00000131981/ENSG00000105289/ENSG00000173597/ENSG00000184307/ENSG00000173237/ENSG00000139835/ENSG00000130701/ENSG00000109193/ENSG00000169562/ENSG00000168237/ENSG00000229005/ENSG00000237424/ENSG00000231013/ENSG00000164251/ENSG00000145384/ENSG00000100889/ENSG00000166126/ENSG00000114455/ENSG00000147804/ENSG00000171174/ENSG00000166391/ENSG00000156006/ENSG00000229719/ENSG00000118094/ENSG00000204978/ENSG00000227070/ENSG00000137960/ENSG00000171431/ENSG00000106384/ENSG00000245293/ENSG00000168903/ENSG00000081051/ENSG00000165841/ENSG00000138109/ENSG00000243955/ENSG00000106688/ENSG00000225329/ENSG00000176919/ENSG00000203645/ENSG00000232070/ENSG00000161653/ENSG00000137251/ENSG00000144852/ENSG00000039537/ENSG00000167117/ENSG00000169715/ENSG00000182327/ENSG00000184434/ENSG00000197888/ENSG00000114812/ENSG00000240045/ENSG00000172831/ENSG00000112818/ENSG00000176532/ENSG00000108187/ENSG00000119227/ENSG00000249231/ENSG00000137825/ENSG00000168748/ENSG00000244067/ENSG00000236384/ENSG00000172782/ENSG00000224822/ENSG00000130055/ENSG00000174992/ENSG00000135917/ENSG00000170439/ENSG00000224057/ENSG00000178462/ENSG00000163295/ENSG00000114113/ENSG00000243550/ENSG00000146039/ENSG00000186198/ENSG00000125144/ENSG00000086696/ENSG00000248144/ENSG00000060566/ENSG00000163586/ENSG00000135220/ENSG00000122121/ENSG00000066230/ENSG00000164825/ENSG00000044012/ENSG00000182271/ENSG00000197273/ENSG00000166869/ENSG00000170827/ENSG00000148942/ENSG00000165828/ENSG00000205358/ENSG00000267709/ENSG00000124253/ENSG00000166825/ENSG00000256612/ENSG00000174358/ENSG00000138308/ENSG00000081800/ENSG00000163959/ENSG00000216560
## BUSSLINGER_DUODENAL_MATURE_ENTEROCYTES ENSG00000160712/ENSG00000185000/ENSG00000072778/ENSG00000120756/ENSG00000173638/ENSG00000187837/ENSG00000127948/ENSG00000124299/ENSG00000162458/ENSG00000101782/ENSG00000102882/ENSG00000080031/ENSG00000090020/ENSG00000144063/ENSG00000108242/ENSG00000130762/ENSG00000136826/ENSG00000184163/ENSG00000095066/ENSG00000110628/ENSG00000197142/ENSG00000138678/ENSG00000008517/ENSG00000158201/ENSG00000136059/ENSG00000189334/ENSG00000177106/ENSG00000179674/ENSG00000105357/ENSG00000162769/ENSG00000185561/ENSG00000133328/ENSG00000070961/ENSG00000106211/ENSG00000099812/ENSG00000065618/ENSG00000012171/ENSG00000002726/ENSG00000148343/ENSG00000176092/ENSG00000166145/ENSG00000136068/ENSG00000173237/ENSG00000160685/ENSG00000154269/ENSG00000143167/ENSG00000108846/ENSG00000169692/ENSG00000128298/ENSG00000167600/ENSG00000145384/ENSG00000060971/ENSG00000100889/ENSG00000147872/ENSG00000166126/ENSG00000114455/ENSG00000147804/ENSG00000141574/ENSG00000124839/ENSG00000142484/ENSG00000125046/ENSG00000166959/ENSG00000074416/ENSG00000143375/ENSG00000142949/ENSG00000118094/ENSG00000138109/ENSG00000138792/ENSG00000167114/ENSG00000166866/ENSG00000176919/ENSG00000099834/ENSG00000232070/ENSG00000014914/ENSG00000108352/ENSG00000187699/ENSG00000156381/ENSG00000182327/ENSG00000123643/ENSG00000224259/ENSG00000162545/ENSG00000076351/ENSG00000172831/ENSG00000213996/ENSG00000186204/ENSG00000233041/ENSG00000151150/ENSG00000007216/ENSG00000241224/ENSG00000172367/ENSG00000161533/ENSG00000085552/ENSG00000185133/ENSG00000163295/ENSG00000091138/ENSG00000146039/ENSG00000148671/ENSG00000197165/ENSG00000160868/ENSG00000060566/ENSG00000188242/ENSG00000122121/ENSG00000186115/ENSG00000165828/ENSG00000124253/ENSG00000166825/ENSG00000118777/ENSG00000256612/ENSG00000174358/ENSG00000141434/ENSG00000172689
## GAO_LARGE_INTESTINE_24W_C10_ENTEROCYTE ENSG00000176945/ENSG00000177943/ENSG00000188747/ENSG00000185000/ENSG00000120756/ENSG00000073792/ENSG00000159166/ENSG00000109854/ENSG00000144063/ENSG00000144426/ENSG00000130958/ENSG00000010438/ENSG00000151715/ENSG00000002726/ENSG00000136052/ENSG00000166391/ENSG00000243955/ENSG00000253958/ENSG00000166866/ENSG00000099834/ENSG00000144852/ENSG00000007306/ENSG00000168748/ENSG00000012504/ENSG00000259823/ENSG00000044012/ENSG00000197273/ENSG00000165828/ENSG00000166825/ENSG00000174358/ENSG00000081800/ENSG00000103375
The TOP5 gene sets with the most negative NES are related to the intestinal epithelium.
Enterocytes are one of the four main cell types of the intestinal epithelium
Hope to see you soon
Chaimae EL HOUJJAJI